{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "BOwsuGQQY9OL"
   },
   "outputs": [],
   "source": [
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.optimizers import Adam\n",
    "from tensorflow.keras import regularizers\n",
    "import tensorflow.keras.utils as ku \n",
    "import numpy as np "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "PRnDnCW-Z7qv"
   },
   "outputs": [],
   "source": [
    "tokenizer = Tokenizer()\n",
    "# !wget --no-check-certificate \\\n",
    "#     https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sonnets.txt \\\n",
    "#     -O sonnets.txt\n",
    "data = open('../../tensorflow_datasets/sonnets.txt').read()\n",
    "\n",
    "corpus = data.lower().split(\"\\n\")\n",
    "\n",
    "\n",
    "tokenizer.fit_on_texts(corpus)\n",
    "total_words = len(tokenizer.word_index) + 1\n",
    "\n",
    "# create input sequences using list of tokens\n",
    "input_sequences = []\n",
    "for line in corpus:\n",
    "\ttoken_list = tokenizer.texts_to_sequences([line])[0]\n",
    "\tfor i in range(1, len(token_list)):\n",
    "\t\tn_gram_sequence = token_list[:i+1]\n",
    "\t\tinput_sequences.append(n_gram_sequence)\n",
    "\n",
    "\n",
    "# pad sequences \n",
    "max_sequence_len = max([len(x) for x in input_sequences])\n",
    "input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))\n",
    "\n",
    "# create predictors and label\n",
    "predictors, label = input_sequences[:,:-1],input_sequences[:,-1]\n",
    "\n",
    "label = ku.to_categorical(label, num_classes=total_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "w9vH8Y59ajYL"
   },
   "outputs": [],
   "source": [
    "model = Sequential()\n",
    "model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))\n",
    "model.add(Bidirectional(LSTM(150, return_sequences = True)))\n",
    "model.add(Dropout(0.2))\n",
    "model.add(LSTM(100))\n",
    "model.add(Dense(total_words/2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))\n",
    "model.add(Dense(total_words, activation='softmax'))\n",
    "model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "print(model.summary())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "AIg2f1HBxqof"
   },
   "outputs": [],
   "source": [
    " history = model.fit(predictors, label, epochs=100, verbose=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "1fXTEO3GJ282"
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "acc = history.history['accuracy']\n",
    "loss = history.history['loss']\n",
    "\n",
    "epochs = range(len(acc))\n",
    "\n",
    "plt.plot(epochs, acc, 'b', label='Training accuracy')\n",
    "plt.title('Training accuracy')\n",
    "\n",
    "plt.figure()\n",
    "\n",
    "plt.plot(epochs, loss, 'b', label='Training Loss')\n",
    "plt.title('Training loss')\n",
    "plt.legend()\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {},
    "colab_type": "code",
    "id": "6Vc6PHgxa6Hm"
   },
   "outputs": [],
   "source": [
    "seed_text = \"Help me Obi Wan Kenobi, you're my only hope\"\n",
    "next_words = 100\n",
    "  \n",
    "for _ in range(next_words):\n",
    "\ttoken_list = tokenizer.texts_to_sequences([seed_text])[0]\n",
    "\ttoken_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')\n",
    "\tpredicted = model.predict_classes(token_list, verbose=0)\n",
    "\toutput_word = \"\"\n",
    "\tfor word, index in tokenizer.word_index.items():\n",
    "\t\tif index == predicted:\n",
    "\t\t\toutput_word = word\n",
    "\t\t\tbreak\n",
    "\tseed_text += \" \" + output_word\n",
    "print(seed_text)"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "name": "NLP_Week4_Exercise_Shakespeare_Answer.ipynb",
   "provenance": [
    {
     "file_id": "1qmXwwCaT07-qviCiBV65lkus_KvwsyQI",
     "timestamp": 1559583256599
    },
    {
     "file_id": "1V60jn23JMcMpwhF-KvCTYIIfm4J2X6v5",
     "timestamp": 1558728367158
    }
   ],
   "version": "0.3.2"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "metadata": {
     "collapsed": false
    },
    "source": []
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
